Packages To Be Used
library(anomalize)
library(dplyr)
library(FinTS)
library(forecast)
library(fpp2)
library(fUnitRoots)
library(ggplot2)
library(gridExtra)
library(hrbrthemes)
library(lmtest)
library(lubridate)
library(pdR)
library(plotly)
library(RColorBrewer)
library(tibbletime)
library(tidyverse)
library(timetk)
library(TSA)
library(tseries)
library(xts)
setwd("C:/Users/Cesur/Desktop")
data <- read.csv("Project.csv")
data <- data %>% select(Tarih,Açılış)
data$Tarih <- as.Date(data$Tarih,format="%d.%m.%Y")
data <- data[order(data$Tarih,decreasing = FALSE),]
data[,2] <- as.numeric(gsub(",", ".", data[,2]))
dim(data)
## [1] 109 2
The data consists of 7 columns and 109 rows. First column is Date. The others are continous variables.
head(data)
## Tarih Açılış
## 109 2014-12-01 9.41
## 108 2015-01-01 9.60
## 107 2015-02-01 9.28
## 106 2015-03-01 9.08
## 105 2015-04-01 8.62
## 104 2015-05-01 8.85
tail(data)
## Tarih Açılış
## 6 2023-07-01 203.5
## 5 2023-08-01 232.4
## 4 2023-09-01 245.1
## 3 2023-10-01 242.5
## 2 2023-11-01 220.3
## 1 2023-12-01 255.5
sum(is.na(data))
## [1] 0
summary(data)
## Tarih Açılış
## Min. :2014-12-01 Min. : 4.82
## 1st Qu.:2017-03-01 1st Qu.: 8.94
## Median :2019-06-01 Median : 12.75
## Mean :2019-06-01 Mean : 35.12
## 3rd Qu.:2021-09-01 3rd Qu.: 17.45
## Max. :2023-12-01 Max. :255.50
I will concentrate on Açılış columnn on my project.
The source of data is https://tr.investing.com/equities/turk-hava-yollari-historical-data.
Açılış variable varies with price of 4.820 ₺ to 255.50 ₺ Its mean is 35.12 ₺.
serhat <- data %>%
ggplot(aes(x= Tarih, y= Açılış)) +
geom_area(fill="blue", alpha=0.5) +
geom_line(color="black") +
ylab("THYAO Price") +
labs(title="Turkish Airlines Price (THYAO) from 2014-12 to 2023-12")+
theme_ipsum()
serhat <- ggplotly(serhat)
serhat
It is seen that there is an increasing trend at the graph. There is no seasonality in the data.
Spliting Data
datap <- ts(data[,2],start = c(2014,12),end=c(2023,12),frequency = 12)
train <- ts(datap[1:97], start = c(2014, 12), frequency = 12)
test <- ts(datap[98:109],start=c(2023,1),end=c(2023,12),frequency = 12)
We split our data as train and test. Test shows that last 12 observation of the data.
Anomalize Detection
df <- data[1:97,]
df <- as_tibble(df)
# df %>%
#time_decompose(Açılış, method = "stl", frequency = "auto", trend = "auto") %>%
#anomalize(remainder,method = "gesd", alpha = 0.05, max_anoms = 0.2) %>%
#plot_anomaly_decomposition()
Box-Cox Transformation
BoxCox.ar(train,method = c("ols"))
lambda1 <- BoxCox.lambda(train)
lambda1
## [1] -0.4406391
train_box <- BoxCox(train,lambda1)
autoplot(train_box,"Transformed Time Series Plot Of THYAYO Price",color="darkblue") + theme_minimal()
We did Box-Cox transformation because lambda value equals to -0.44 which is not equal to 0. The series is not still stationary. because there is increasing and decreasing trend in series.
library(RColorBrewer)
rb<-brewer.pal(7,"Blues")
par(mfrow=c(1,2))
plot(train, col=rb[7], xlab = "Original Series", ylab="THYAO Price")
plot(train_box, col=rb[4], xlab = "Differenced Series", ylab="THYAO Price")
Original VS Transformed
p1<-ggAcf(train_box,lag.max = 96) + theme_minimal() + ggtitle("ACF Of Transformed Data")
p2<-ggPacf(train_box,lag.max = 96)+ theme_minimal() + ggtitle("PACF Of Transformed Data")
grid.arrange(p1,p2,ncol=2)
As it is seen that, there is a slow decay in ACF graph, meaning that the series is not stationary. There is no need to check PACF.
Unit - Root Tests
kpss.test(train_box,null="Level")
## Warning in kpss.test(train_box, null = "Level"): p-value smaller than printed
## p-value
##
## KPSS Test for Level Stationarity
##
## data: train_box
## KPSS Level = 1.4779, Truncation lag parameter = 3, p-value = 0.01
kpss.test(train_box,null="Trend")
##
## KPSS Test for Trend Stationarity
##
## data: train_box
## KPSS Trend = 0.15655, Truncation lag parameter = 3, p-value = 0.0412
The p - value of KPSS Test for Level Stationary is 0.01 < 0.05. Therefore, we reject H0 and we can conclude that the series is not stationary. In addition, The p - value of KPSS Test for Trend Stationary is 0.0412 < 0.05. Hence, we can say that the series has stochastic trend. We need to take difference to remove stochastic trend.
pp.test(train_box)
##
## Phillips-Perron Unit Root Test
##
## data: train_box
## Dickey-Fuller Z(alpha) = -3.9635, Truncation lag parameter = 3, p-value
## = 0.8868
## alternative hypothesis: stationary
Since p value is greater than α=0.05 , we fail to reject H0 . It means that we don’t have enough evidence to claim that we have a stationary system.
mean(train_box)
## [1] 1.499884
The mean of stock prices is not 0 or close to 0. In such cases, we will prefer to use ADF test with c
adfTest(train_box, lags=1, type="c")
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 1
## STATISTIC:
## Dickey-Fuller: 0.603
## P VALUE:
## 0.9888
##
## Description:
## Sun Jan 21 03:18:54 2024 by user: Cesur
Since p value is greater than α=0.05 , we fail to reject H0 = The series is not stationary It means that the series is not stationary.
adfTest(train_box, lags=1, type="ct")
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 1
## STATISTIC:
## Dickey-Fuller: -1.1308
## P VALUE:
## 0.9136
##
## Description:
## Sun Jan 21 03:18:54 2024 by user: Cesur
Since p value is greater than α=0.05 , we fail to reject H0. It means that we have non stationary system having stochastic trend.
ndiffs(train_box)
## [1] 1
It is enough to take 1 difference for the series to make stationary.
diff_train <- diff(train_box)
It seems stationary after taking difference. Let’s check it with unit root tests.
kpss.test(diff_train,null="Level")
##
## KPSS Test for Level Stationarity
##
## data: diff_train
## KPSS Level = 0.35046, Truncation lag parameter = 3, p-value = 0.09851
P - value of KPSS test is 0.09851 which is higher than 0.05, so we cannot reject null hypothesis (H0) and we can conclude that the differenced series are stationary.
adfTest(diff_train, type="nc")
## Warning in adfTest(diff_train, type = "nc"): p-value smaller than printed
## p-value
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 1
## STATISTIC:
## Dickey-Fuller: -5.9011
## P VALUE:
## 0.01
##
## Description:
## Sun Jan 21 03:18:54 2024 by user: Cesur
It is enough to check adftest with “nc” because we made the constant part zero. Since its p value < 0.05, the series is stationary.
pp.test(diff_train)
## Warning in pp.test(diff_train): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: diff_train
## Dickey-Fuller Z(alpha) = -100.65, Truncation lag parameter = 3, p-value
## = 0.01
## alternative hypothesis: stationary
Since p value is less than α, we reject H0. We can conclude that the differenced series is stationary.
autoplot(diff_train,main="Time Series Plot of the One - Differenced Data",xlab="THYAO Price",col="blue")
p3<-ggAcf(diff_train,lag.max = 96) + theme_minimal() + ggtitle("ACF Of One - Differenced Data")
p4<-ggPacf(diff_train,lag.max = 96)+ theme_minimal() + ggtitle("PACF Of One - Differenced Data")
grid.arrange(p3,p4,ncol=2)
The series become stationary after taking one difference. Since there is no seasonality in the data, there is no need to check HEGY.TEST. One differencing is enough. I will suggest a model according to results of ACP-PACF graphs.
We can suggest \(ARIMA(0,1,0)\) called as random walk. We can also suggest \(ARIMA(7,1,7)\) .
fit1 <-Arima(train_box,order = c(0,1,0), seasonal = c(0, 0,0))
fit1
## Series: train_box
## ARIMA(0,1,0)
##
## sigma^2 = 0.001657: log likelihood = 171.11
## AIC=-340.22 AICc=-340.18 BIC=-337.65
fit2 <-Arima(train_box,order = c(7,1,7), seasonal = c(0, 0,0))
fit2
## Series: train_box
## ARIMA(7,1,7)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 ma1
## -0.0576 0.3741 -0.0536 0.1110 0.4824 -0.1187 -0.4888 0.0751
## s.e. 0.1424 0.1320 0.1244 0.1194 0.1190 0.1324 0.1324 0.1116
## ma2 ma3 ma4 ma5 ma6 ma7
## -0.3469 -0.0249 0.0250 -0.4171 0.1724 0.948
## s.e. 0.1054 0.1017 0.1026 0.1007 0.1101 0.122
##
## sigma^2 = 0.001361: log likelihood = 182.52
## AIC=-335.03 AICc=-329.03 BIC=-296.56
Since \(\phi_7\) and \(\theta_7\) are significant, \(ARIMA(7,1,7)\) is appropriate.
a<-fit1$aic
b<-fit1$bic
c<-fit2$aic
d<-fit2$bic
cat("Model 1 AIC:", a, ", Model 1 BIC:", b, "\n")
## Model 1 AIC: -340.2178 , Model 1 BIC: -337.6535
cat("Model 2 AIC:", c, ", Model 2 BIC:", d, "\n")
## Model 2 AIC: -335.0301 , Model 2 BIC: -296.5649
Since AIC and BIC of fit 1 is smaller than fit 2, we choose \(ARIMA(0,1,0)\).
auto.arima(train_box)
## Series: train_box
## ARIMA(0,1,0)
##
## sigma^2 = 0.001657: log likelihood = 171.11
## AIC=-340.22 AICc=-340.18 BIC=-337.65
Also, auto.arima suggests ARIMA (0,1,0) for the series.
Diagnostic Checking
Jarque Bera and Shapiro Test
r1=resid(fit1)/sd(residuals(fit1))
jarque.bera.test(r1)
##
## Jarque Bera Test
##
## data: r1
## X-squared = 0.70911, df = 2, p-value = 0.7015
shapiro.test(r1)
##
## Shapiro-Wilk normality test
##
## data: r1
## W = 0.99064, p-value = 0.735
Since p value is bigger than alpha(0.05) , we reject H0. Hence, Normality Assumption is satisfied.
Let’s check the QQ - Plot of it.
ggplot(r1, aes(sample = r1)) +stat_qq()+geom_qq_line(col="red")+ggtitle("QQ Plot of the Residuals")+theme_minimal()
Breusch-Godfrey Test
m = lm(r1 ~ 1+zlag(r1))
bgtest(m,order=12)
##
## Breusch-Godfrey test for serial correlation of order up to 12
##
## data: m
## LM test = 14.689, df = 12, p-value = 0.2589
Since p - value is greater than alpha(0.05), we have 95% confident that the residuals of the model are uncorrelated, according to the result of Breusch-Godfrey Test.
ARCH Engle’s Test
ArchTest(r1)
##
## ARCH LM-test; Null hypothesis: no ARCH effects
##
## data: r1
## Chi-squared = 9.6194, df = 12, p-value = 0.6493
Since p value = 0.6493 > 0.05 , we fail to reject H0. Hence, we can say that there is no presence of ARCH effects. Therefore, e do not need to use ARCH and GARCH model.
Forecasting
Back - Transformation
f<-forecast(fit1,h=12)
accuracy(f,test)
## ME RMSE MAE MPE MAPE
## Training set 0.0059151 0.04049819 0.03240554 0.3116619 2.178828
## Test set 182.7450093 189.58227554 182.74500933 98.8327941 98.832794
## MASE ACF1 Theil's U
## Training set 0.203766 0.01137959 NA
## Test set 1149.100252 0.76983534 6.670223
accuracy(f,test)[1,7]
## [1] 0.01137959
Forecast values are produced with respect to transformed data here. We should apply Back-Transformation.
back<-exp(f$mean)
accuracy(back,test)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 177.3773 184.4137 177.3773 95.69492 95.69492 0.7698353 6.468543
het <- resid(fit1)/sd(residuals(fit1))
hete <- het^2
g1<-ggAcf(as.vector(hete),lag.max = 72)+theme_minimal()+ggtitle("ACF of Squared Residuals")
g2<-ggPacf(as.vector(hete),lag.max = 72)+theme_minimal()+ggtitle("PACF of Squared Residuals")
grid.arrange(g1,g2,ncol=2)
Homoscedasticity is checked in the code above. Almost all spikes are in of the white noise bands that is an indication of homoscedasticity.
Modelling
ETS
fitnew=ets(train,model="ZZZ")
fitnew
## ETS(M,A,N)
##
## Call:
## ets(y = train, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.7781
## beta = 0.1759
##
## Initial states:
## l = 9.6632
## b = -0.3211
##
## sigma: 0.1365
##
## AIC AICc BIC
## 550.3300 550.9893 563.2035
Fr1=forecast(fitnew,h=12)
autoplot(Fr1) + autolayer(fitted(Fr1),series="fitted")+theme_minimal()
Normality Checking for ETS
r=resid(fitnew)/sd(residuals(fitnew))
jarque.bera.test(r)
##
## Jarque Bera Test
##
## data: r
## X-squared = 1.5811, df = 2, p-value = 0.4536
Since p - value is bigger than alpha (0.05) , we reject H0. Hence, Normality Assumption is satisfied.
Accuracy Test
accuracy(Fr1,test)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.7749416 4.027892 1.984933 1.54562 10.01209 0.2166595
## Test set -18.2301631 30.339217 23.286348 -12.63560 15.24538 2.5417533
## ACF1 Theil's U
## Training set 0.08253673 NA
## Test set 0.50447244 1.421913
TBATS
tbatsmodel<-tbats(train)
tbatsmodel
## BATS(0, {0,0}, 1, -)
##
## Call: tbats(y = train)
##
## Parameters
## Lambda: 0
## Alpha: 0.801937
## Beta: 0.152849
## Damping Parameter: 1
##
## Seed States:
## [,1]
## [1,] 2.36459508
## [2,] 0.05025062
## attr(,"lambda")
## [1] 7.52242e-09
##
## Sigma: 0.1270689
## AIC: 545.1156
autoplot(train,main="TS plot of Train with TBATS Fitted") + autolayer(fitted(tbatsmodel), series="Fitted") +theme_minimal()
tbats_forecast<-forecast(tbatsmodel,h=12)
autoplot(tbats_forecast)+theme_minimal()
Normality Checking for TBATS
rr=resid(tbatsmodel)/sd(residuals(tbatsmodel))
jarque.bera.test(rr)
##
## Jarque Bera Test
##
## data: rr
## X-squared = 0.1476, df = 2, p-value = 0.9289
Since p - value is bigger than alpha (0.05) , we reject H0. Hence, Normality Assumption is satisfied.
Accuracy Test
accuracy(tbats_forecast,test)
## ME RMSE MAE MPE MAPE
## Training set 0.351754 3.558368 1.941747 -0.02715223 10.26801
## Test set -231.990450 299.177908 231.990450 -112.82190062 112.82190
## MASE ACF1 Theil's U
## Training set 0.2119458 -0.1697709 NA
## Test set 25.3222404 0.6908177 9.343687
NNETAR
nnet_for <- nnetar(train,P=0)
nnet_for
## Series: train
## Model: NNAR(1,1)
## Call: nnetar(y = train, P = 0)
##
## Average of 20 networks, each of which is
## a 1-1-1 network with 4 weights
## options were - linear output units
##
## sigma^2 estimated as 12.59
fcast <- forecast(nnet_for,h=12)
autoplot(fcast) + autolayer(fitted(fcast),series="fitted")+theme_minimal()
## Warning: Removed 1 row containing missing values (`geom_line()`).
Normality Checking for NNETAR
shapiro.test(residuals(nnet_for))
##
## Shapiro-Wilk normality test
##
## data: residuals(nnet_for)
## W = 0.75404, p-value = 2.21e-11
Since p - value is smaller than alpha (0.05) , we reject H0. Hence, Normality Assumption is not satisfied.
Accuracy Test
accuracy(fcast,test)
## ME RMSE MAE MPE MAPE MASE
## Training set 4.140114e-05 3.547983 1.921649 -1.391268 10.32818 0.209752
## Test set 2.635436e+01 54.588206 45.558700 7.849195 22.72422 4.972827
## ACF1 Theil's U
## Training set -0.4650233 NA
## Test set 0.7700396 1.766325
Comparing Train
etsac <- accuracy(Fr1,test)[1, 1:5]
tbatsac <- accuracy(tbats_forecast,test)[1, 1:5]
nnetarac <- accuracy(fcast,test)[1, 1:5]
arimaac <- accuracy(f,test)[1, 1:5]
etsac <- as.data.frame(etsac)
nnetarac <- as.data.frame(nnetarac)
tbatsac <- as.data.frame(tbatsac)
arimaac <- as.data.frame(arimaac)
ets7 <- accuracy(Fr1, test)[1,7]
tbats7 <- accuracy(tbats_forecast,test)[1,7]
nnetar7 <- accuracy(fcast,test)[1,7]
arima7 <- accuracy(f,test)[1,7]
ets7 <- as.data.frame(ets7)
tbats7 <- as.data.frame(tbats7)
nnetar7 <- as.data.frame(nnetar7)
arima7 <- as.data.frame(arima7)
acf_train <- cbind(etsac,nnetarac,tbatsac,arimaac)
acf7 <- cbind(ets7,tbats7,nnetar7,arima7)
colnames(acf7) <- c("etsac","nnetarac","tbatsac","arimaac")
accuracycompare <- rbind(acf_train, acf7)
colnames(accuracycompare) <- c("ETS","TBATS","NNETAR","ARIMA")
rownames(accuracycompare) <- c('ME', 'RMSE', 'MAE', 'MPE', 'MAPE', 'ACF1')
round(accuracycompare,2)
## ETS TBATS NNETAR ARIMA
## ME 0.77 0.00 0.35 0.01
## RMSE 4.03 3.55 3.56 0.04
## MAE 1.98 1.92 1.94 0.03
## MPE 1.55 -1.39 -0.03 0.31
## MAPE 10.01 10.33 10.27 2.18
## ACF1 0.08 -0.17 -0.47 0.01
According to accuracy of train values, the best model is ARIMA with the lowest values.
Comparing Forecast
etstr <- accuracy(Fr1,test)[2, 1:5]
tbatstr <- accuracy(tbats_forecast,test)[2, 1:5]
nnetartr <- accuracy(fcast, test)[2, 1:5]
arimatr <- accuracy(f,test)[2, 1:5]
etstr <- as.data.frame(etstr)
tbatstr <- as.data.frame(tbatstr)
nnetartr <- as.data.frame(nnetartr)
arimatr <- as.data.frame(arimatr)
etstr7 <- accuracy(Fr1,test)[2, 7]
tbatstr7 <- accuracy(tbats_forecast,test)[2, 7]
nnetartr7 <- accuracy(fcast, test)[2, 7]
arimatr7 <- accuracy(back,test)[7]
etstr7<- as.data.frame(etstr7)
tbatstr7 <- as.data.frame(tbatstr7)
nnetartr7 <- as.data.frame(nnetartr7)
arimatr7 <- as.data.frame(arimatr7)
acf_test <- cbind(etstr,tbatstr,nnetartr,arimatr)
acf7_test <- cbind(etstr7,tbatstr7,nnetartr7,arimatr7)
colnames(acf7_test) <- c("etstr","tbatstr","nnetartr","arimatr")
accuracytestcompare <- rbind(acf_test, acf7_test)
colnames(accuracytestcompare) <- c("ETS","TBATS","NNETAR","ARIMA")
rownames(accuracytestcompare) <- c('ME', 'RMSE', 'MAE', 'MPE', 'MAPE', 'ACF1')
round(accuracytestcompare,2)
## ETS TBATS NNETAR ARIMA
## ME -18.23 -231.99 26.35 182.75
## RMSE 30.34 299.18 54.59 189.58
## MAE 23.29 231.99 45.56 182.75
## MPE -12.64 -112.82 7.85 98.83
## MAPE 15.25 112.82 22.72 98.83
## ACF1 0.50 0.69 0.77 6.47
According to accuracy of forecasting values, the best model is ETS with lowest RMSE and AICF.
clrs <- c("green", "red", "blue")
autoplot(Fr1) +
autolayer(Fr1$mean, series = "Forecast", size = 1) +
autolayer(fitted(fitnew), series = "Fitted", size = 1) +
autolayer(datap, series ="Data", size = 1) +
ylab("") +
scale_color_manual(values = clrs ) + geom_vline(xintercept = 2023+(01-1)/12)